import pandas as pd
from datetime import datetime

# Value substituted into the input file name below.
c = 1000

df = pd.read_csv('../data/gpo_final_data/narratives_complete_with_metadata_manual_labels_rich_{0}.csv'.format(c))

# Keep only the document id and the narrative text, turning every space inside
# a narrative into an underscore so each narrative becomes a single token.
temp = [
    {'doc': row['doc'], 'narrative': row['narrative'].replace(' ', '_')}
    for row in df[['doc', 'narrative']].to_dict('records')
]
df = pd.DataFrame(temp)

# One row per document: its narrative tokens joined by single spaces.
narratives_by_doc = df.groupby(['doc'])['narrative']
temp = narratives_by_doc.apply(' '.join).reset_index()

# Narrative embeddings (skipgram model)
import time
import gensim

print('Training narrative embeddings...')

window_size = 40
vector_size = 50
n_iterations = 10
n_workers = 1

# Each document becomes one "sentence" whose tokens are its narratives
# (whitespace-separated in temp['narrative']).
corpus = list(temp['narrative'])
corpus = [str(sentence).split() for sentence in corpus]

t0 = time.time()

# gensim 4.0 renamed two constructor keywords (size -> vector_size,
# iter -> epochs). Pick the spelling that matches the installed version so the
# script runs on both major versions instead of raising TypeError on 4.x.
if int(gensim.__version__.split('.')[0]) >= 4:
    renamed_kwargs = {'vector_size': vector_size, 'epochs': n_iterations}
else:
    renamed_kwargs = {'size': vector_size, 'iter': n_iterations}

# NOTE(review): the section header calls this a skip-gram model, but `sg` is
# not passed, so gensim's default training algorithm (sg=0, CBOW) is used.
# Confirm which one is intended before changing it -- it alters the results.
# A fixed seed with a single worker is used for reproducibility; gensim's docs
# note that PYTHONHASHSEED must also be fixed for identical runs across
# interpreter launches.
model = gensim.models.Word2Vec(
    sentences=corpus,
    window=window_size,
    min_count=1,
    workers=n_workers,
    seed=123,
    **renamed_kwargs
)

model.save('../models/narrative_embeddings.model')

# The reported time covers training plus saving the model.
print('Training word embeddings took:')
print(time.time() - t0)

# Query through the KeyedVectors (`model.wv`): the model-level `most_similar`
# is deprecated in gensim 3.x and removed in 4.0, while `model.wv` works in both.
l1 = model.wv.most_similar('saddam_hussein_pose_threat', topn=20)
# Turn each neighbour token back into a readable phrase.
l1 = [' '.join(l[0].split('_')) for l in l1]

import plotly
import numpy as np
import plotly.graph_objs as go
from sklearn.decomposition import PCA

# Sample 20 narrative rows and de-duplicate them. Sorting (instead of relying
# on set iteration order, which varies with string hashing between runs)
# keeps the row order of the plotted data reproducible.
labels = sorted(set(df['narrative'].sample(n=20, random_state=1)))

# Look vectors up through `model.wv`: indexing the model object directly is
# deprecated in gensim 3.x and removed in 4.0, while `model.wv[...]` works in both.
vectors = np.stack([model.wv[label] for label in labels])

# Fit PCA on the sampled vectors and keep the first two components.
two_dim = PCA(random_state=0).fit_transform(vectors)[:, :2]

dfplot = pd.DataFrame({"x": two_dim[:, 0], "y": two_dim[:, 1], "labels": labels})

import plotly.express as px
# Display names for the plotted columns. The "topic_manual_label" entry is
# kept from the original call even though dfplot only has x, y and labels.
axis_titles = {
    "x": "PCA (Dimension 1)",
    "y": "PCA (Dimension 2)",
    "topic_manual_label": "Topic",
}

# Text-annotated scatter of the two PCA dimensions.
fig = px.scatter(
    dfplot,
    x="x",
    y="y",
    text="labels",
    labels=axis_titles,
    width=1250,
    height=750,
)

# Put each label above its marker and use a white plot background.
fig.update_traces(textposition='top center')
fig.update_layout(plot_bgcolor='white')

# Show the figure, then write it to disk as HTML.
fig.show()
fig.write_html('../figures/Figure_I_1.html')

# PMI-style association scores between narratives:
#   score(i, j) = cooc(i, j) / (cooc(i, i) * cooc(j, j))
# where cooc(i, j) counts ordered pairs of occurrences of narratives i and j
# inside the same document (so cooc(i, i) is the sum over documents of the
# squared number of occurrences of narrative i).
narratives = list(set(df['narrative']))
print(len(narratives))

# Build the name -> position lookup once; calling narratives.index() for
# every token is a linear scan per lookup.
narrative_pos = {name: pos for pos, name in enumerate(narratives)}

temp = temp.to_dict('records')

n_nodes = len(narratives)
A = np.zeros((n_nodes, n_nodes))

for d in temp:
    ids = np.array(
        [narrative_pos[token] for token in d['narrative'].split()],
        dtype=np.intp,
    )
    # np.add.at is unbuffered, so a narrative repeated within a document
    # contributes once per ordered pair, exactly like a nested loop over ids.
    np.add.at(A, (ids[:, None], ids[None, :]), 1)

# Snapshot the diagonal BEFORE normalising. Normalising cell by cell in place
# overwrites A[i, i] with 1 / A[i, i] as soon as the diagonal cell is reached,
# so every later cell in that row/column would be divided by the wrong value.
self_counts = A.diagonal().copy()
denom = np.outer(self_counts, self_counts)

# Cells with a zero denominator (a narrative that never appears as a token)
# are left at 0 rather than becoming NaN, which would rank as "largest" in a
# top-n selection.
A = np.divide(A, denom, out=np.zeros_like(A), where=denom > 0)

def largest_indices(ary, n):
    """Return the indices of the n largest values of an array.

    Args:
        ary: Array-like of numeric values, of any shape.
        n: Number of entries to select. Values larger than the number of
            elements are clamped to it; values <= 0 select nothing.

    Returns:
        A tuple with one index array per dimension of ``ary`` (the format
        produced by ``np.unravel_index``), ordered from the largest value to
        the smallest.
    """
    arr = np.asarray(ary)
    flat = arr.ravel()

    # Clamp n: argpartition raises when n exceeds the array size, and n == 0
    # would select everything because ``[-0:]`` is the whole array.
    n = min(int(n), flat.size)
    if n <= 0:
        return tuple(np.empty(0, dtype=np.intp) for _ in range(arr.ndim))

    # Partial selection of the top n, then sort only those n in descending order.
    indices = np.argpartition(flat, -n)[-n:]
    indices = indices[np.argsort(-flat[indices])]
    return np.unravel_index(indices, arr.shape)

narr = 'saddam_hussein_pose_threat'

# Column of A belonging to the target narrative; pick its 20 highest entries.
target_column = A[:, narratives.index(narr)]
indices = largest_indices(target_column, 20)

# Turn the selected narrative tokens back into readable phrases.
l2 = [narratives[pos].replace('_', ' ') for pos in indices[0]]

# Two columns side by side: embedding neighbours (l1) and the neighbours
# ranked by the co-occurrence scores in A (l2).
tab = pd.DataFrame([l1, l2]).T
tab.to_latex('../tables/Table_I_1.tex', index=False)